

import os

import pandas as pd

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import sys
sys.path.append('/home/dalton_m/payload')
from basicfunctions import *

filename = 'ANppp_matchrate'
resultsloc1 = "/dataERS/eract/daltonm/results/ppp/"
from datetime import date
datestr = date.today().strftime(format="%Y%m%d")
resultsloc = resultsloc1 + datestr + '/'
if not os.path.exists(resultsloc):
    os.makedirs(resultsloc)
logging.basicConfig(filename=resultsloc + filename + '.txt', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')
logging.exception('And this, too')
logging.captureWarnings(True)
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from pylab import *
import seaborn as sns
sys.path.append("/home/dalton_m/ppp")
from rona_estimatefunctions_gpu import *

import matplotlib.font_manager as fm
mpl.rcParams['font.family']='serif'
cmfont = fm.FontProperties(fname=mpl.get_data_path() + '/fonts/ttf/cmr10.ttf')
mpl.rcParams['font.serif']=cmfont.get_name()
mpl.rcParams['mathtext.fontset']='cm'
mpl.rcParams['axes.unicode_minus']=False
sns.set_style('darkgrid', {'font.family' : 'serif',
                                   'font_serif':cmfont.get_name(),
                                   'mathtext.fontset' :'cm',
                                   'axes.unicode_minus' : False })


'''
BRS 2020 data
'''

fname = 'qbs/BRS_Full_Data_Set_9.30.xlsx'
xl = pd.ExcelFile(dataloc + fname, engine='openpyxl')
print(xl.sheet_names)
df = []
for sheet in ['Export Worksheet1', 'Export Worksheet2']:
    df.append(xl.parse(sheet))


df = pd.concat(df, ignore_index=True)

df['response_num'] = pd.to_numeric(df['RESPONSE'], errors='coerce')

cond = (df['COMPLETED_DATE'].notnull())
df = df[cond]

###only care about response to last question
kcond = (df['QUESTION_ID']==10)
df = df[kcond]

###########
##brs info
dfbrs = opensplitnew('qbs/BRS_info', ['UNIT_NUMBER', 'ldb_num', 'ein'])

mc = ['UNIT_NUMBER']
df = merging(df, dfbrs, mc, mc, True, 'left')

##employment
yr = 2019
qtr = ''
fips = ''
own = 5
cols = ['aaemp', 'ldb_num', 'ein']
dfldb = qcew(yr = yr,qtr = qtr,fips = fips,own = own, cols  =cols)
dfldb['empmax_19'] = dfldb[['emp_m'+str(i) for i in range(1,13)]].to_pandas().max(axis=1)
dfldb = dfldb.groupby('ein', as_index=False)['empmax_19'].sum().reset_index().to_pandas()
tdict = dict(zip(dfldb['ein'], dfldb['empmax_19']))
dfldb = None
df['empmax_19'] = df['ein'].map(tdict)

dfo = df.copy()

###PPP info
mc = ['ldb_num']
###ldb specific info
dfp = opennew('pppfiles/pppagg2020', ['LoanAmount', 'DateApproved',  'ldb_num', 'fuzzy_type', 'fuzzy_group', 'ldb_cosine_fuzzy']).drop_duplicates(mc).to_pandas()
df = df.merge(dfp, on = mc, how = 'left')

mc = ['ein']
rdict = {
    c : c + '_ein' for c in [ 'fuzzy_type', 'fuzzy_group', 'ldb_cosine_fuzzy']
}
dfp = opennew('pppfiles/pppagg2020', ['LoanAmount_ein', 'DateApproved_ein',  'ein', 'fuzzy_type', 'fuzzy_group', 'ldb_cosine_fuzzy']).drop_duplicates(mc).rename(columns = rdict).to_pandas()
df = df.merge(dfp, on = mc, how = 'left')
dfp = None


cond = (df['LoanAmount'] > 0)
df['Dppp'] = np.where(cond, 1, 0)


cond = (df['LoanAmount_ein'] > 0)
df['Dppp_ein'] = np.where(cond, 1, 0)

df['amt_peremp_ein'] = df['LoanAmount_ein'] / df['empmax_19']
cond = (df['amt_peremp_ein'] < 500)
df['LoanAmount_ein'] = np.where(cond, np.NaN, df['LoanAmount_ein'])
df['DateApproved_ein'] = np.where(cond, np.NaN, df['DateApproved_ein'])

###get amount and date approved
df['approval_date'] = pd.to_datetime(df['DateApproved_ein'].fillna(df['DateApproved']), errors='coerce')
df['amount_final'] = df['LoanAmount_ein'].fillna(df['LoanAmount']).fillna(0)

cond = (df['amount_final'] > 0)
df['Dppp_final'] = np.where(cond, 1, 0)

df['Dppp_date'] = df[['COMPLETED_DATE', 'approval_date']].apply(lambda x: 1*(pd.to_datetime(x[0]) > pd.to_datetime(x[1])) , axis=1)



df['fuzzy_type'] = df['fuzzy_type'].fillna(df['fuzzy_type_ein']).fillna('No Match')
df['fuzzy_group'] = df['fuzzy_group'].fillna(df['fuzzy_group_ein']).fillna('No Match')
df['ldb_cosine_fuzzy'] = df['ldb_cosine_fuzzy'].fillna(df['ldb_cosine_fuzzy_ein'])
##replace if PPP date not lining up right
cond = (df['Dppp_date']==False)
for c in ['fuzzy_type', 'fuzzy_group']:
    df[c] = np.where(cond, 'No Match', df[c])


cond = (df['CHOICE_TEXT'] == 'Yes')
conda = (df['CHOICE_TEXT'].isin(['Yes', 'No']))
df['Dq10yes'] = np.where(conda,np.where(cond,1,0), np.NaN)


print(df.groupby('Dppp_date')['Dq10yes'].mean())

bins = [-9e9,.7,.8,.9,.9999999, 9e9]
labs = ['Lowest', 'Low', 'Medium', 'High', 'Exact']
df['fuzz_score'] = pd.cut(df['ldb_cosine_fuzzy'], bins=bins, labels=labs)

df.groupby(['fuzzy_type', 'fuzzy_group','fuzz_score'])['Dq10yes'].mean()
dft1 = df.groupby(['fuzzy_type', 'fuzzy_group','fuzz_score'])['Dq10yes'].mean()
dft2 = df.groupby(['fuzzy_type', 'fuzzy_group','fuzz_score'])['Dq10yes'].count()

tdict1 = {
    'a' : 'City',
'b' : 'County',
'c' : 'Commuting Zone',
}
rlist = []
## in category
tloc = ('Exact Address Match', 'a', 'Exact')
v1 = int(dft2.loc[tloc])
#mean
v2 = round(100*dft1.loc[tloc], 1)
rlist.append(f'Exact Match & Address & {tloc[2]} & {v1} & {v2}\%')

tloc = ('Address Match', 'a', 'High')
v1 = int(dft2.loc[tloc])
#mean
v2 = round(100*dft1.loc[tloc], 1)
rlist.append(f'Fuzzy Match & Address & - & {v1} & {v2}\%')


for tk in tdict1.keys():
    tloc = ('Exact Match', tk, 'Exact')
    v1 = int(dft2.loc[tloc])
    #mean
    v2 = round(100*dft1.loc[tloc], 1)
    rlist.append(f'{tloc[0]} & {tdict1[tloc[1]]} & {tloc[2]} & {v1} & {v2}\%')


    tloc = ('Fuzzy Match', tk, 'High')
    v1 = int(dft2.loc[tloc])
    #mean
    v2 = round(100*dft1.loc[tloc], 1)
    rlist.append(f'{tloc[0]} & {tdict1[tloc[1]]} & {tloc[2]} & {v1} & {v2}\%')

    tloc = ('Fuzzy Match', tk, 'Medium')
    v1 = int(dft2.loc[tloc])
    #mean
    v2 = round(100*dft1.loc[tloc], 1)
    rlist.append(f'{tloc[0]} & {tdict1[tloc[1]]} & {tloc[2]} & {v1} & {v2}\%')

    tloc = ('Fuzzy Match', tk, 'Low')
    v1 = int(dft2.loc[tloc])
    #mean
    v2 = round(100*dft1.loc[tloc], 1)
    rlist.append(f'{tloc[0]} & {tdict1[tloc[1]]} & {tloc[2]} & {v1} & {v2}\%')

    tloc = ('Fuzzy Match', tk, 'Lowest')
    v1 = int(dft2.loc[tloc])
    #mean
    v2 = round(100*dft1.loc[tloc], 1)
    rlist.append(f'{tloc[0]} & {tdict1[tloc[1]]} & {tloc[2]} & {v1} & {v2}\%')



v1 = int(df[df['fuzzy_type']=='No Match']['Dq10yes'].count())
#mean
v2 = round(100*df[df['fuzzy_type']=='No Match']['Dq10yes'].mean(), 1)
rlist.append("\multicolumn{3}{l}{BRS Respondents with no PPP Match}" + f'  & {v1} & {v2}\%')

print((r"\\" + '\n').join(rlist)+ r'\\')

'''
********************************************************************************
********************************************************************************
match rate
********************************************************************************
********************************************************************************
'''
dfp = cudf.read_csv(dataloc + 'pppfiles/ppp2020matched.csv', sep = '|')
#only keep if from 2020
dfp['loan_year'] = pd.to_datetime(dfp['DateApproved'].to_pandas()).dt.year
dfp = dfp[dfp['loan_year']==2020]
dfp = pppdropcond(dfp)
# keep if NOT in these type of businesses
kcond = (dfp['BusinessType'].isin(['Self-Employed Individuals',
                                   'Independent Contractors', 'Sole Proprietorship']))
# keep if NOT non-profit with naics of religious org
kcond = (kcond | ((dfp['BusinessType'] == 'Non-Profit Organization') & (dfp['NAICSCode'] == 813110)))
dfp['Doos'] = np.where(kcond.to_pandas(), 1, 0)
##drop dups
dfp.drop_duplicates('ID', inplace=True)
##
dfp[['Doos', 'fuzzy_type', 'Ddropcond']].to_pandas().groupby(['Doos', 'Ddropcond'])['fuzzy_type'].value_counts() / len(dfp)

dfp[['Doos', 'fuzzy_type', 'Ddropcond', 'LoanAmount']].to_pandas().groupby(['Doos', 'Ddropcond', 'fuzzy_type'])['LoanAmount'].sum() / dfp['LoanAmount'].sum()

###match rate table
###row 1
#total loans
v1 = round(len(dfp)/1e6,1)
#amount
v2 = int(round(dfp['LoanAmount'].sum()/1e9,0))
#% match
cond = (dfp['Ddropcond'] == 0) & (dfp['fuzzy_type'] != 'No Match')
v3 = round(100 * (len(dfp[cond]) / len(dfp)), 1)
#% amount match
v4 = round(100 * (dfp[cond]['LoanAmount'].sum() / dfp['LoanAmount'].sum()), 1)
r1 = f'All Loans & {v1} & {v2} & {v3} & {v4} \\\\ \n'
###row 2
cond1 = (dfp['Doos']==0)
#total loans
v1 = round(len(dfp[cond1])/1e6,1)
#amount
v2 = int(round(dfp[cond1]['LoanAmount'].sum()/1e9,0))
#% match
cond = (dfp['Ddropcond'] == 0) & (dfp['fuzzy_type'] != 'No Match')
v3 = round(100 * (len(dfp[cond&cond1]) / len(dfp[cond1])), 1)
#% amount match
v4 = round(100 * (dfp[cond&cond1]['LoanAmount'].sum() / dfp[cond1]['LoanAmount'].sum()), 1)
r2 = "\hspace{.1in}" + f'Out of Scope Business Types & {v1} & {v2} & {v3} & {v4} \\\\ \n'

###row 3
cond2 = (dfp['JobsReported'] > 1)
#total loans
v1 = round(len(dfp[cond1 & cond2])/1e6,1)
#amount
v2 = int(round(dfp[cond1 & cond2]['LoanAmount'].sum()/1e9,0))
#% match
cond = (dfp['Ddropcond'] == 0) & (dfp['fuzzy_type'] != 'No Match')
v3 = round(100 * (len(dfp[cond&cond1 & cond2]) / len(dfp[cond1 & cond2])), 1)
#% amount match
v4 = round(100 * (dfp[cond&cond1 & cond2]['LoanAmount'].sum() / dfp[cond1 & cond2]['LoanAmount'].sum()), 1)
r3 = "\hspace{.1in}" + f'Out of Scope + Reporting Only 1 Job Saved& {v1} & {v2} & {v3} & {v4} \\\\ \n'


t2 = r'''After removing...&&&&\\''' + '\n'

print(r1 + t2 + r2 + r3 + r'')

'''
##################################################
##################################################
matchtype
##################################################
##################################################
'''

roworder = ['Exact Address Match',
            'Address Match',
            'Exact Match,\nCity',
            'Exact Match,\nCounty',
            'Exact Match,\nCommuting Zone',
        'Fuzzy Match,\nCity',
            'Fuzzy Match,\nCounty',
            'Fuzzy Match,\nCommuting Zone',
        'No Match'
        ]

'''
graph by match type
'''
ftypedict = {
    'a' : ',\nCity',
    'b' : ',\nCounty',
    'c' : ',\nCommuting Zone',
    'No Match' : ''
}
dfp[['fuzzy_type', 'fuzzy_group']] = dfp[['fuzzy_type', 'fuzzy_group']].fillna('No Match')

dfp['fuzz_typegroup'] = dfp[['fuzzy_type', 'fuzzy_group']].to_pandas().apply(lambda x: x[0] + ftypedict[x[1]], axis=1)
cond = (dfp['fuzz_typegroup'].str.contains('Exact Address')).to_array()
cond2 = (dfp['fuzz_typegroup'].str.startswith('Address')).to_array()
dfp['fuzz_typegroup'] = np.where(cond, ['Exact Address Match']*len(cond), np.where(cond2, ['Address Match']*len(cond), dfp['fuzz_typegroup'].to_array()))

###if drop condition, treat these as no match
cond = (dfp['Ddropcond'] == 1).to_pandas()
dfp['fuzz_typegroup'] = np.where(cond, 'No Match', dfp['fuzz_typegroup'].to_pandas())

cond = ((dfp['fuzzy_type'].isin(['No Match'])) | (dfp['fuzzy_type'].isnull())).to_array()
dfp['unity'] = 1
dfp['unity2'] = 1
dfp['Dmatch_wgt'] = np.where(cond, [0]*len(cond), dfp['LoanAmount'].to_array())

aggstats = {
    'unity' : 'sum',
    'Dmatch_wgt' : 'sum',
    'LoanAmount' : 'sum',
    'unity2' : 'count'
}

mcond = (dfp['Doos'] == 0)
df1 = dfp[mcond].groupby('fuzz_typegroup').agg(aggstats).to_pandas()
df1 = df1.reindex(reversed(roworder))
df1['Dmatch_wgt'] = df1['LoanAmount'] / df1['LoanAmount'].sum()
df1['Dmatch'] = df1['unity'] / df1['unity2'].sum()

graphnum = 0
vardict = {
    'Dmatch' : 'Distribution of Loans Matched,\n Based on Match Type',
'Dmatch_wgt' : 'Distribution of Loan Amount Matched,\n Based on Match Type',
}

#bar graphs
for var in ['Dmatch', 'Dmatch_wgt']:
    graphnum = graphnum + 1
    numcols = 1
    numrows = 1
    f, ax = plt.subplots(figsize=(7 * numcols, 6 * numrows),constrained_layout=True)
    #ax.set_xticklabels(df1.index, Rotation=90)
    ax.barh(df1.index, df1[var])
    ax.set_title(vardict[var], fontsize=15)  # for title
    ax.set_ylabel("Type of Match", fontsize=10)  # label for x-axis
    ax.set_xlabel("Proportion", fontsize=10)  # label for y-axis
    #ax.legend(loc='best', fontsize='small')
    plt.savefig(resultsloc + filename + '_graph_' +var + 'matchtype' + '.pdf')



'''
********************************************************************************
********************************************************************************
box graphs
********************************************************************************
********************************************************************************
'''


'''
********************************************************************************
employment ratio
********************************************************************************
'''
yr = 2019
qtr = ''
fips = ''
own = 5
cols = ['aaemp', 'wages', 'ldb_num']
dfldb = qcew(yr = yr,qtr = qtr,fips = fips,own = own, cols  =cols)
dfldb['empmax_19'] = dfldb[['emp_m'+str(i) for i in range(1,13)]].to_pandas().max(axis=1)
dfldb['wagemax_19'] = dfldb[['wages_'+str(i) for i in range(1,5)]].to_pandas().max(axis=1) * 4
mc = ['ldb_num']
dfp = dfp.merge(dfldb[mc + ['empmax_19', 'wagemax_19']], on = mc, how = 'inner')

dfp['jobs_emp_ratio'] = dfp['JobsReported'] / dfp['empmax_19']
dfp['jobs_emp_ratio'] = dfp['jobs_emp_ratio'].replace([np.inf], np.NaN)

var = 'jobs_emp_ratio'

vardict = {
    'jobs_emp_ratio' : 'Box Plot of Ratio of Jobs Reported\n in PPP to Emp. in 2019 QCEW',
}

tdict = {j:i for i,j in enumerate(roworder)}
dfp['fuzz_typegroup_num'] = dfp['fuzz_typegroup'].map(tdict)

graphnum = graphnum + 1
numcols = 1
numrows = 1
cond = (dfp['fuzz_typegroup'] != 'No Match') & (dfp['Ddropcond'] == 0)
f, ax = plt.subplots(figsize=(8 * numcols, 6.5 * numrows),constrained_layout=True)
dfp.loc[cond ][[var, 'fuzz_typegroup_num']].to_pandas().boxplot(by='fuzz_typegroup_num', showfliers=False, grid=False, ax=ax)
f.suptitle('')
ax.set_xticklabels(roworder[:-1], Rotation=90)
#plotting ratio of 1
plt.axhline(y = 1, color = 'lightskyblue', linestyle = '--', label = "PPP Jobs Saved = 2019 Employment")
ax.set_title(vardict[var], fontsize=15)  # for title
ax.set_xlabel("Type of Match", fontsize=10)  # label for x-axis
ax.set_ylabel("", fontsize=10)  # label for y-axis
f.subplots_adjust(bottom=0.3)
ax.legend(loc='best', fontsize='small')
plt.savefig(resultsloc + filename + '_graph_' +var  + '.pdf')



'''
********************************************************************************
loan to wage
********************************************************************************
'''


dfp['loan_wage_ratio'] = dfp['LoanAmount'] / dfp['wagemax_19']
dfp['loan_wage_ratio'] = dfp['loan_wage_ratio'].replace([np.inf], np.NaN)

var = 'loan_wage_ratio'


vardict = {
    'loan_wage_ratio' : 'Box Plot of Ratio of Loan Amount\n in PPP to Wages in 2019 QCEW',
}


graphnum = graphnum + 1
numcols = 1
numrows = 1
cond = (dfp['fuzz_typegroup'] != 'No Match')
f, ax = plt.subplots(figsize=(8 * numcols, 6.5 * numrows),constrained_layout=True)
dfp.loc[cond ][[var, 'fuzz_typegroup_num']].to_pandas().boxplot(by='fuzz_typegroup_num', showfliers=False, grid=False, ax=ax)
f.suptitle('')
ax.set_xticklabels(roworder[:-1], Rotation=90)
#plotting ratio of 1
tenweeks = 10/52
plt.axhline(y = tenweeks, color = 'lightskyblue',  linestyle = '--', label = "10 Weeks of 2019 Wages")
ax.set_title(vardict[var], fontsize=15)  # for title
ax.set_xlabel("Type of Match", fontsize=10)  # label for x-axis
ax.set_ylabel("", fontsize=10)  # label for y-axis
f.subplots_adjust(bottom=0.3)
ax.legend(loc='best', fontsize='small')
plt.savefig(resultsloc + filename + '_graph_' +var  + '.pdf')



'''
******************************************************************************
******************************************************************************
takeup rate
******************************************************************************
******************************************************************************
'''
df = cudf.read_csv(dataloc + 'pppfiles/' + 'ANppp_ldbv1' + '_step1.csv')
df = df[(df['aaemp_19']>0) & (df['emp_jan_feb20']>0)]
###needed for cutoff number
df['aaemp'] = df['aaemp_19'].copy()
df['ui_aaemp'] = df['ein_aaemp_19'].copy()
##franchise
einlist = cudf.read_csv(dataloc + 'pppfiles/sba_franchise_fullymerged.csv', sep='|')[
    'ein'].unique().to_pandas().tolist()
tdict = {i : 1 for i in einlist}
df['Dfranchise'] = df['ein'].map(tdict).fillna(0)
einlist = None
tdict = None
df = industrycutoffs(df)


mc = ['ldb_num']
###ldb specific info
dfp = opennew('pppfiles/pppagg2020', ['LoanAmount', 'DateApproved',  'ldb_num']).drop_duplicates(mc)
df = df.merge(dfp, on = mc, how = 'left')
dfp = None


mc = ['ein']
dfp = opennew('pppfiles/pppagg2020', ['LoanAmount_ein', 'DateApproved_ein',  'ein']).drop_duplicates(mc)
df = df.merge(dfp, on = mc, how = 'left')
dfp = None


conda = (df['cutoff_score'].notnull()).to_pandas()
cond = (df['cutoff_score'] <= 1).to_pandas()
df['Deligible'] = np.where(conda, np.where(cond,1, 0), np.NaN)


cond = (df['LoanAmount'] > 0).to_pandas()
df['Dppp'] = np.where(cond, 1, 0)

cond = (df['LoanAmount_ein'] > 0).to_pandas()
df['Dppp_ein'] = np.where(cond, 1, 0)

df['amt_peremp_ein'] = df['LoanAmount_ein'] / df['ein_aaemp_19']
cond = (df['amt_peremp_ein'] < 500).to_pandas()
df['LoanAmount_ein'] = np.where(cond, np.NaN, df['LoanAmount_ein'].to_pandas())
df['DateApproved_ein'] = np.where(cond, np.NaN, df['DateApproved_ein'].to_pandas())

###get amount and date approved
df['approval_date'] = pd.to_datetime(df['DateApproved_ein'].to_pandas().fillna(df['DateApproved'].to_pandas()), errors='coerce')
df['amount_final'] = df['LoanAmount_ein'].to_pandas().fillna(df['LoanAmount'].to_pandas()).fillna(0)

cond = (df['amount_final'] > 0).to_pandas()
df['Dppp_final'] = np.where(cond, 1, 0)


print(df.groupby('Deligible')['Dppp'].mean())
print(df.groupby('Deligible')['Dppp_final'].mean())
cond1 = (df['Dppp_final'] == 1)
cond0 = (df['Deligible'] == 1)
print(df[ cond0 & cond1]['aaemp_19'].sum() / df[cond0]['aaemp_19'].sum())
print(df[ cond0 & cond1]['aaemp_19'].count() / df[cond0]['aaemp_19'].count())

# bins = [-9e9,1,5,10,25,50,250,9e9]
# labs = ['0-1', '1-5', '5-10', '10-25', '25-50', '50-250', '250+']
# df['empmax_bins'] = pd.cut(df['emp_max_19'].to_pandas(), bins=bins, labels=labs)
# df.groupby('empmax_bins')['Dppp_final'].mean()
# df.groupby(['Deligible', 'empmax_bins'])['Dppp_final'].mean()

##################################################
##################################################
##### census pulse comparability
##################################################
##################################################
df['naics2_test'] = df['naics'].astype('str').str[:2]
df['naics3'] = df['naics'].astype('str').str[:3]
#industry cutoffs
cond = ((df['naics3'].to_pandas().isin(['482', '491', '111', '112', '521', '525', '813', '814'])) | (df['naics2_test'].to_pandas().isin(['92'])) | (df['naics'].to_pandas() == 110000))
df['censuspulseind'] = np.where(cond, 0, 1)
logging.exception('% of estab receiving - Census Pulse definition')
# & (df['Deligible'] == 1)
cond = (df['aaemp_19'] < 500) & (df['ein_aaemp_19'] == df['aaemp_19'])& (df['censuspulseind'] == 1)
print(df[cond]['Dppp_final'].mean())
cond1 = (df['Dppp_final'] == 1)
logging.exception('% of eligible employment receiving')
print(df[cond & cond1]['aaemp_19'].sum() / df[cond ]['aaemp_19'].sum())
print(df[cond & cond1]['aaemp_19'].count() / df[cond ]['aaemp_19'].count())

##################################################
##################################################
##### census sbus comparability (autor 2022)
##################################################
##################################################
#industry cutoffs
cond = ((df['naics3'].to_pandas().isin(['482', '491', '111', '112' '814'])) | (df['naics2_test'].to_pandas().isin(['92'])) | (df['naics'].to_pandas().isin([525110,525120,525190,541120])))
df['susbind'] = np.where(cond, 0, 1)
cond1 = (df['susbind'] == 1).to_pandas()
print(df[cond1 & (df['ein_aaemp_19'] < 500).to_pandas()].groupby('Deligible')['aaemp_19'].sum())
print(df[cond1 & (df['ein_aaemp_19'] >= 500).to_pandas()].groupby('Deligible')['aaemp_19'].sum())

###remove loans to get susb loan estimates
dfp2 = cudf.read_csv(dataloc + 'pppfiles/ppp2020matched.csv', sep = '|')[['naics_code', 'JobsReported', 'fuzzy_type', 'BusinessType', 'employer_edit', 'employer_fuzzy', 'ldb_cosine_fuzzy', 'fuzzy_group',
                                                                          ]]
dfp2 = pppdropcond(dfp2)

dfp2['naics2_test'] = dfp2['naics_code'].astype('str').str[:2]
dfp2['naics3'] = dfp2['naics_code'].astype('str').str[:3]
cond = ((dfp2['naics3'].to_pandas().isin(['482', '491', '111', '112' '814'])) | (dfp2['naics2_test'].to_pandas().isin(['92'])) | (dfp2['naics_code'].to_pandas().isin([525110,525120,525190,541120])) | (dfp2['BusinessType'].to_pandas().isin(['Self-Employed Individuals',
                                   'Independent Contractors', 'Sole Proprietorship'])))
dfp2['susbind'] = np.where(cond, 0, 1)
cond1 = (dfp2['susbind'] == 1).to_pandas()

print(dfp2[cond1]['JobsReported'].sum())

#industry cutoffs
cond = ((dfp2['naics3'].to_pandas().isin(['482', '491', '111', '112', '521', '525', '813', '814'])) | (dfp2['naics2_test'].to_pandas().isin(['92'])) | (dfp2['naics_code'].to_pandas() == 110000)  | (dfp2['BusinessType'].to_pandas().isin(['Self-Employed Individuals',
                                   'Independent Contractors', 'Sole Proprietorship'])))
dfp2['censuspulseind'] = np.where(cond, 0, 1)

cond1 = (dfp2['censuspulseind']==1) & ((dfp2['Ddropcond'] == 1) | (dfp2['fuzzy_type']=='No Match')) & (dfp2['JobsReported'] > 1)
dfp2['unity']=1

#####oos
# keep if NOT in these type of businesses
kcond = (dfp2['BusinessType'].isin(['Self-Employed Individuals',
                                   'Independent Contractors', 'Sole Proprietorship']))
# keep if NOT non-profit with naics of religious org
kcond = (kcond | ((dfp2['BusinessType'] == 'Non-Profit Organization') & (dfp2['naics_code'] == 813110)))
dfp2['Doos'] = np.where(kcond.to_pandas(), 1, 0)

###jobs & loans in-scope that are unmatched htat could be matched
print(dfp2[cond1][['JobsReported', 'unity']].sum())



cond = ((df['naics2_test'] == '72') | (df['Dfranchise'] == 1))
df['Dsingle'] = np.where(cond.to_pandas(), 1,0)


print(df[cond].groupby('Deligible')['Dppp_final'].mean())


cond = ((df['naics2_test'] == '72'))
df['D72'] = np.where(cond.to_pandas(), 1,0)

df[cond].groupby('Deligible')['Dppp_final'].mean()


'''
******************************************
******************************************

closures
******************************************
******************************************
'''

df = cudf.read_csv(dataloc + 'pppfiles/' + 'ANppp_ldbv1' + '_step1.csv')
df = df[(df['aaemp_19']>0) & (df['emp_jan_feb20']>0)]
ldblist = df['ldb_num'].unique().to_pandas().tolist()

yr = 2021
qtr = ''
fips = ''
own = 5
cols = ['ldb_num', 'aaemp']
dfldb = qcew(yr=yr,qtr=qtr,fips=fips,own=own,cols=cols)[['ldb_num', 'emp_m12', 'emp_m12flag']]
dfldb = dfldb[dfldb['ldb_num'].isin(ldblist)]
dfldb[dfldb['emp_m12flag']=='N']['emp_m12'].describe()

##number of 0s
num0 = len(dfldb[(dfldb['emp_m12']==0) | (dfldb['emp_m12'].isnull())])
numpos = len(dfldb[(dfldb['emp_m12']>0)])
numposb = len(dfldb[(dfldb['emp_m12']>0) | (dfldb['emp_m12'].isnull())])
totobs =len(ldblist)
##number with pos employment
numpos/totobs
####use this number
######percent of those open in 2019/begin 2020 that have positive employ or missing in december 2021
print(numposb/totobs)